# For California files, handle duplicate records for UAPN within a county ZTRAX file.

pacman::p_load(data.table, dplyr)

rm(list = ls())

source("code/globals.R")

loadcleandamage <- function(fname) {
  y <- read_fst(file.path(WORKING, paste0(fname, "_cleaned.fst")), as.data.table = T)
  stopifnot(anyDuplicated(y, by = c("FIPS", "UnformattedAssessorParcelNumber", "INCIDENTNUM")) == 0)
  return(y)
}

loadcleanattr <- function(fname) {
  y <- read_fst(file.path(WORKING, paste0(fname, "_cleaned.fst")), as.data.table = T)
  stopifnot(anyDuplicated(y, by = c("ImportParcelID", "BuildingOrImprovementNumber")) == 0)
  return(y)
}

allfiles <- data.frame(character(), character()) %>%
  rbind(c("dins13", "attr13")) %>%
  rbind(c("dins15", "attr15")) %>%
  rbind(c("dins17", "attr16")) %>% ## this one is tricky - i use 17 for damage and 16 for attri..
  rbind(c("dins18", "attr18")) %>%
  rbind(c("damage03_SanDiego", "attr03_SanDiego")) %>%
  rbind(c("damage07_SanDiego", "attr07_SanDiego")) %>%
  rbind(c("damage_ca_07", "attr_ca_07")) %>%
  rbind(c("damage_ca_08", "attr_ca_08")) %>%
  rbind(c("damage_ca_09", "attr_ca_09")) %>%
  rbind(c("damage_ca_12", "attr_ca_12")) %>%
  rename(damage = 1, attribute = 2)

## Which files does this affect? (In which ZTRAX files are FIPS+UAPN not a unique id after inner merge to damage data?).
#- Seems to be just the 2018 data.
for (p in seq_len(nrow(allfiles))) {
  damagefile <- loadcleandamage(allfiles$damage[p])
  attrfile <- loadcleanattr(allfiles$attribute[p])
  a <- merge(attrfile, damagefile, by = c("UnformattedAssessorParcelNumber", "FIPS"))
  print(allfiles$attribute[p])
  print(anyDuplicated(a, by = c("UnformattedAssessorParcelNumber", "FIPS")) == 0)
  rm(a, damagefile, attrfile)
}

## --------------------------- Handle these duplicates -----------------------------#
## - I apply a heuristic to identify the "main" ZTRAX structure on each parcel that is reported as destroyed in the damage data (which is assumedly what the damage inspectors were considering).
## - This heuristic orders structures by number of bedrooms, and then square footage, and then Tax Amount, then breaks any remaining ties randomly.
## - This give highly similar results to just choosing "buildingorimprovementnumber" equal to 1 in all cases.
## Also, in most of these cases ALL the structures on the parcel are destroyed anyway.

handle_ztrax_APN_dupes <- function(damagefile, attrfile) {
  a <- merge(attrfile, damagefile, by = c("UnformattedAssessorParcelNumber", "FIPS"))
  a[, Multiple := .N > 1, by = .(UnformattedAssessorParcelNumber, FIPS)]

  b <- a[Multiple == TRUE, ] %>%
    dplyr::select(
      FIPS, ImportParcelID, BuildingOrImprovementNumber, UnformattedAssessorParcelNumber, sqfeet, LotSizeAcres,
      TaxAmount, YearBuilt, TotalBedrooms, PropertyLandUseStndCode, PropertyFullStreetAddress
    ) %>%
    dplyr::mutate(sqfeet = replace(sqfeet, is.na(sqfeet), 0)) %>% # replace NA values with 0 so they don't sort to the bottom in each group (important below when sorting desired obs to end)
    dplyr::mutate(TotalBedrooms = replace(TotalBedrooms, is.na(TotalBedrooms), 0)) %>%
    dplyr::mutate(TaxAmount = replace(TaxAmount, is.na(TaxAmount), 0)) %>%
    dplyr::mutate(rand = runif(n())) %>% # random tiebreaker. Note this is reproducible because we set seed in globals.R (which gets called at top of script)
    dplyr::arrange(FIPS, UnformattedAssessorParcelNumber, TotalBedrooms, sqfeet, TaxAmount, rand) %>%
    group_by(FIPS, UnformattedAssessorParcelNumber) %>%
    dplyr::mutate(copy = row_number(), numcopies = n()) %>%
    ungroup()

  #-- make a list of rows to be deleted (all except the LAST row in each sorted group)
  b <- b %>%
    dplyr::filter(copy != numcopies) %>%
    dplyr::select(ImportParcelID, BuildingOrImprovementNumber) %>%
    dplyr::mutate(tempconcat = paste0(ImportParcelID, "---j---", BuildingOrImprovementNumber))

  ## -- Remove duplicates from attribute data
  oldrows <- nrow(attrfile)
  attrfile <- attrfile %>%
    dplyr::mutate(tempconcat = paste0(ImportParcelID, "---j---", BuildingOrImprovementNumber)) %>%
    dplyr::filter(!(tempconcat %in% b$tempconcat)) %>%
    dplyr::mutate(tempconcat = NULL)

  print(paste0(oldrows - nrow(attrfile), " rows deleted"))
  return(attrfile)
}

## -- Apply this function where needed, and re-save adjusted data
dins18 <- loadcleandamage("dins18")
attr18 <- loadcleanattr("attr18")
attr18 <- handle_ztrax_APN_dupes(dins18, attr18)
write_fst(attr18, file.path(WORKING, "attr18_cleaned.fst"))
rm(attr18, dins18)
